library(ggplot2)
library(tidyverse)
library(fst)
library(reshape2)
library(miceadds)
library(DescTools)


load("main_data.rdata")
load("all_sibs.rdata")

# make vector of continuing municipalities: 

kom_cont <- 
  c("101", "147", "151", "153", "155", "157", "159", "161", 
    "163", "165", "167", "169", "173", "175", "183", "185", 
    "187", "201", "217", "223", "400", "253", "269", "329", 
    "461", "563", "607", "727", "741", "751", "773", "825")

#create function for splitting income residuals in brackets


var <- c("run_kv", "elected_kv")
lab <- c("Running for \nmunicipality", 
         "Elected for \nmunicipality")

data_out <- tibble()
data_model <- tibble()

for(k in seq(1993, 2013, by = 4)){
  # remove bottom 0.1% and top 0.1% to avoid huge outliers. 
  bef <- 
    read.fst(paste("bef", k, ".fst", sep = ""))  
  
  data_an_year <- 
    data_comp %>% 
    filter(four_years == k) %>% 
    filter(.[,paste("in_year_", k, sep = "")] == 1) %>% 
    filter(inc_res < quantile(inc_res, p = 0.999) &
             inc_res > quantile(inc_res, p = 0.001) )  
  
  for(j in 1:2){
    
    data_an <- 
      left_join(data_an_year, bef, by = "PNR")
    
    full_sib <- full_sib %>% ungroup()
    
    full_sib$quant_year <- 
      unlist(full_sib[,paste0(var[j], "_", k)])
    
    sibs_year <- 
      as.data.frame(full_sib) %>% 
      group_by(fe_sibship) %>% 
      summarise(pol_fam = sum(quant_year))
    
    sibs_year <- 
      sibs_year %>% 
      filter(pol_fam > 0) 
    
    sibs_pol <- 
      full_sib %>% 
      filter(fe_sibship %in% sibs_year$fe_sibship) %>% 
      ungroup()
    
    data_an <-
      data_an %>% 
      filter(PNR %in% sibs_pol$PNR)
    
    sibs_fe <- 
      sibs_pol %>%
      select("PNR", "fe_sibship")
    
    data_sib <- 
      left_join(data_an, sibs_fe, by = "PNR") %>% 
      filter(.[,paste0(var[j], "_", k)] == 0) 
    
    data_sib_inc <-
      data_sib %>%
      group_by(fe_sibship) %>% 
      summarise(sib_inc_res = mean(inc_res, na.rm = TRUE))
    
    data_an <- 
      left_join(data_an, sibs_fe, by = "PNR") %>%
      left_join(., data_sib_inc, by = "fe_sibship") %>% 
      mutate(inc_res2 = inc_res - sib_inc_res)
      
    data_merge <- 
      data_an %>% 
      filter(.[,paste0(var[j], "_", k)] == 1) %>%
      filter(!KOM %in% kom_cont) 
    
    data_cont <- 
      data_an %>% 
      filter(.[,paste0(var[j], "_", k)] == 1) %>%
      filter(KOM %in% kom_cont)
    
    data_out_temp <-
      data_frame(year = k,
                 lab  = lab[j],
                 mean_merge = mean(data_merge$inc_res2, na.rm = TRUE),
                 mean_cont  = mean(data_cont$inc_res2, na.rm = TRUE),
                 se_merge = sd(data_merge$inc_res2, na.rm = TRUE)/sqrt(sum(!is.na(data_merge$inc_res2))),
                 se_cont  = sd(data_cont$inc_res2, na.rm = TRUE) /sqrt(sum(!is.na(data_cont$inc_res2))),
                 n_merge = sum(!is.na(data_merge$inc_res2 )),
                 n_cont = sum(!is.na(data_cont$inc_res2 )))
    
    data_out <- 
      bind_rows(data_out, data_out_temp)
    
    data_year_out <- 
      bind_rows(data_merge %>% 
                  select(c("PNR", "inc_res2", "KOM")) %>% 
                  mutate(year = k,
                         type = var[j], 
                         merg = 1),
                data_cont %>% 
                  select(c("PNR", "inc_res2", "KOM")) %>% 
                  mutate(year = k,
                         type = var[j], 
                         merg = 0))
    
    data_model <- 
      bind_rows(data_model, data_year_out)
  }
  print(k)
  print(Sys.time())
}


# find FEs for old municipalities
bef2006 <- 
  read.fst(paste("bef", 2006, ".fst", sep = ""))  
bef2007 <- 
  read.fst(paste("bef", 2007, ".fst", sep = ""))  

bef2007 <- 
  filter(bef2007, KOM != 411) %>% 
  mutate(nykom = KOM) %>% 
  select(c("PNR", "nykom"))

bef2006 <-
  left_join(bef2006, bef2007, by = "PNR") %>% 
  filter(!is.na(nykom))

nykom <- 
  bef2006 %>% 
  group_by(KOM) %>% 
  summarise(nykom = Mode(nykom, na.rm =TRUE))

data_model <- 
  left_join(data_model, nykom, by ="KOM")


# Find model estimates: 

ests_elected <- matrix(NA, nrow = 5, ncol = 4)
ests_running <- matrix(NA, nrow = 5, ncol = 4)

for(i in 1:4){
  k <- seq(2001, 2013, 4)[i]
  
  data_mod_elected <- 
    data_model %>% 
    filter((year == k - 8 | year == k) & type == "elected_kv") %>% 
    mutate(post = year == k)
  
  mod_elected  <- lm.cluster(formula = inc_res2 ~ post*merg + as.factor(nykom), 
                             data = data_mod_elected, 
                             cluster = data_mod_elected$nykom)
  
  ests_elected[1:2, i] <- summary(mod_elected)[nrow(summary(mod_elected)), 1:2]
  ests_elected[  4, i] <- nrow(data_mod_elected)
  ests_elected[  5, i] <- length(unique(data_mod_elected$PNR))
  
  # repeat for running
  
  data_mod_running <- 
    data_model %>% 
    filter((year == k - 8 | year == k) & type == "run_kv") %>% 
    mutate(post = year == k)
  
  mod_running <- lm.cluster(formula = inc_res2 ~ post*merg + as.factor(nykom), 
                            data = data_mod_running, 
                            cluster = data_mod_running$nykom)
  
  ests_running[1:2, i] <- summary(mod_running)[nrow(summary(mod_running)), 1:2]
  ests_running[  4, i] <- nrow(data_mod_running)
  ests_running[  5, i] <- length(unique(data_mod_running$PNR))
  
}


##  plot data 

plot_data <- 
  tibble()

ests <- matrix(NA, nrow = 4, ncol = 2)

for(i in 1:4){
  k <- c(1993, 1997, 2009, 2013)[i]
  
  data_mod <- 
    data_model %>% 
    filter((year == k | year == 2001) & type == "elected_kv" & merg == 1) %>% 
    mutate(post = year == k)
  
  mod_elected  <- lm.cluster(formula = inc_res2 ~ post + as.factor(nykom), 
                             data = data_mod, 
                             cluster = data_mod$nykom)
  
  ests[1, 1:2] <- summary(mod_elected)[2, 1:2]

  # repeat for continuing

  data_mod <- 
    data_model %>% 
    filter((year == k | year == 2001) & type == "elected_kv" & merg == 0) %>% 
    mutate(post = year == k)
  
  mod_elected  <- lm.cluster(formula = inc_res2 ~ post + as.factor(nykom), 
                             data = data_mod, 
                             cluster = data_mod$nykom)
  
  ests[2, 1:2] <- summary(mod_elected)[2, 1:2]
  
  # repeat for running
  
  data_mod <- 
    data_model %>% 
    filter((year == k | year == 2001) & type == "run_kv" & merg == 1) %>% 
    mutate(post = year == k)
  
  mod_elected  <- lm.cluster(formula = inc_res2 ~ post + as.factor(nykom), 
                             data = data_mod, 
                             cluster = data_mod$nykom)
  
  ests[3, 1:2] <- summary(mod_elected)[2, 1:2]
  
  # repeat for continuing
  
  data_mod <- 
    data_model %>% 
    filter((year == k | year == 2001) & type == "run_kv" & merg == 0) %>% 
    mutate(post = year == k)
  
  mod_elected  <- lm.cluster(formula = inc_res2 ~ post + as.factor(nykom), 
                             data = data_mod, 
                             cluster = data_mod$nykom)
  
  ests[4, 1:2] <- summary(mod_elected)[2, 1:2]
  
  plot_data_temp <- 
    tibble(year = rep(c(k-0.2, k + 0.2), 2),
           lab  = rep(c(2,1), each = 2),  # Elected
           variable = rep(c("merg","cont"), 2), 
           estimate = ests[, 1],
           se       = ests[, 2])
  
  plot_data <- 
    bind_rows(plot_data, plot_data_temp)
  
}

# add 2001 data

plot_data_2001 <- 
  plot_data_temp %>% 
  mutate(year = year - 12,
         estimate = 0,
         se = 0)
 
plot_data <- 
  bind_rows(plot_data, plot_data_2001)


plot_data <- 
  mutate(plot_data, 
         lab = as.factor(lab),
         lab = fct_recode(lab,
                          "Elected for \nmunicipality" = "2",
                          "Running for \nmunicipality" = "1"),
         place = case_when(year == 1992.8 ~ 0.9,
                           year == 1993.2 ~ 1.1,
                           year == 1996.8 ~ 1.9,
                           year == 1997.2 ~ 2.1,
                           year == 2000.8 ~ 2.9,
                           year == 2001.2 ~ 3.1,
                           year == 2008.8 ~ 3.9,
                           year == 2009.2 ~ 4.1,
                           year == 2012.8 ~ 4.9,
                           year == 2013.2 ~ 5.1,
         )) 


         
         
plot <- 
  ggplot(data = plot_data,
         aes(x = place,
             y = estimate,
             ymin = estimate + qnorm(0.025) * se,
             ymax = estimate + qnorm(0.975) * se,
             alpha = variable)) + 
  facet_grid(lab ~ .) +
  geom_errorbar(width = 0) + 
  geom_point() + 
  theme_classic() +
  scale_y_continuous("Income Score") + 
  scale_x_continuous("Year", breaks = 1:5, labels = c(1993, 1997, 2001, 2009, 2013)) +
  geom_vline(xintercept = 3, linetype = "dashed", alpha = 0.5) + 
  scale_alpha_discrete("", range = c(0.25, 1), 
                       labels = c("Continuing \nmunicipalities",
                                  "Amalgamated \nmunicipalities")) +
  theme(panel.spacing = unit(2, "lines"))